Identify tags in the ATIS airline database

Minimal code

- Read the dataset
- Transform the data
- Minimal model
    - Embeddings
    - Dense layer

In [ ]:
from __future__ import print_function

import os 
import numpy as np 

import tensorflow as tf 
print(tf.__version__)

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

#Show images
import matplotlib.pyplot as plt
%matplotlib inline
# plt configuration
plt.rcParams['figure.figsize'] = (10, 10)        # size of images
plt.rcParams['image.interpolation'] = 'nearest'  # show exact image
plt.rcParams['image.cmap'] = 'gray'  # use grayscale

Dataset

The ATIS (Airline Travel Information System) dataset, available at: https://github.com/mesnilgr/is13/blob/master/data/load.py

Example:

Input (words): show flights from Boston to New York today

Output (labels): O O O B-dept O B-arr I-arr B-date
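
To make the encoding concrete, here is a minimal sketch of how one tagged sentence is stored: two parallel sequences of indices, decoded through the words2idx and labels2idx dictionaries (the index values below are illustrative, not the real ATIS codes):

In [ ]:
# Illustrative only: toy index dictionaries for the example sentence above
words2idx_toy  = {'show': 0, 'flights': 1, 'from': 2, 'boston': 3,
                  'to': 4, 'new': 5, 'york': 6, 'today': 7}
labels2idx_toy = {'O': 0, 'B-dept': 1, 'B-arr': 2, 'I-arr': 3, 'B-date': 4}

sentence_idx = [0, 1, 2, 3, 4, 5, 6, 7]   # word indices
labels_idx   = [0, 0, 0, 1, 0, 2, 3, 4]   # one label index per word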


In [ ]:
# Read data
import pickle
import sys

atis_file = '/home/ubuntu/data/training/text/atis/atis.pkl'
with open(atis_file,'rb') as f:
    if sys.version_info.major==2:
        train, test, dicts = pickle.load(f) #python2.7
    else:
        train, test, dicts = pickle.load(f, encoding='bytes') #python3

train / test sets:

- X: list of input sequences (one sequence of word indices per sentence)
- label: list of target labels associated with each word in each sentence

Dictionaries:

- labels2idx: to decode the labels
- words2idx: to decode the sentences
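
A quick way to sanity-check the unpickled objects (a sketch, assuming the structure described above; under python3 the dictionary keys come back as byte strings):

In [ ]:
# Optional sanity check of the unpickled structure
print(type(train), len(train))     # 3 elements: X, _, labels
print(list(dicts.keys()))          # dictionary names (byte strings in python3)
print(len(train[0]), 'training sentences')
print(len(test[0]), 'test sentences')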

In [ ]:
# Dictionaries and train/test partition.
# Under python3 the pickled keys are byte strings, so decode them to str.
w2idx = {k.decode('utf-8'): v for k, v in dicts[b'words2idx'].items()}
ne2idx = {k.decode('utf-8'): v for k, v in dicts[b'tables2idx'].items()}
labels2idx = {k.decode('utf-8'): v for k, v in dicts[b'labels2idx'].items()}

# Inverse dictionaries: index -> word / label
idx2w = dict((v, k) for k, v in w2idx.items())
idx2la = dict((v, k) for k, v in labels2idx.items())

train_x, _, train_label = train
test_x, _, test_label = test


# Visualize the first two sentences of each partition
wlength = 35
for name, xs, ls in [('train', train_x, train_label), ('test', test_x, test_label)]:
    print(name)
    for sw, sl in zip(xs[:2], ls[:2]):
        print('WORD'.rjust(wlength), 'LABEL'.rjust(wlength))
        for wx, la in zip(sw, sl):
            print(idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength))
        print('\n' + '**' * 30 + '\n')

In [ ]:
# Select the words tagged with label 48 (b'B-fromloc.city_name') in the first
# sentences of train and test, to check that the two partitions differ.
for name, xs, ls in [('train', train_x, train_label), ('test', test_x, test_label)]:
    print(name)
    print('---------')
    for sw, sl in zip(xs[:5], ls[:5]):
        for wx, la in zip(sw, sl):
            if la == 48:
                print(idx2w[wx])
    print('\n')

Data transformation

- Convert the list of word sequences into an array of words × features.
- The features are the context of each word in the sentence:
    - For each word, generate the context from the previous and next words in the sentence.
    - For words at the beginning and end of the sentence, use padding to complete the context.

In [ ]:
# ID_PAD: padding code, one above the maximum word index in the training set
ID_PAD = np.max([np.max(tx) for tx in train_x]) + 1
print('ID_PAD: ', ID_PAD)

def context(l, size=3):
    """Return one window of `size` word indices per word in l, padding the
    sentence with ID_PAD at both ends."""
    l = list(l)
    lpadded = size // 2 * [ID_PAD] + l + size // 2 * [ID_PAD]
    out = [lpadded[i:(i + size)] for i in range(len(l))]
    return out

# Example
x = np.array([0, 1, 2, 3, 4], dtype=np.int32)
print('Context vectors: ', context(x))
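
With the default size=3 and padding code P = ID_PAD, the windows for [0, 1, 2, 3, 4] are [P, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4] and [3, 4, P]: one context vector per word. Note that with an even size such as the 10 used below, the window is slightly asymmetric: five previous words, the word itself and four following words.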

In [ ]:
# Build the train and test X / y arrays: one context row per word
X_trn = []
for s in train_x:
    X_trn += context(s, size=10)
X_trn = np.array(X_trn)

X_tst = []
for s in test_x:
    X_tst += context(s, size=10)
X_tst = np.array(X_tst)

print('X_trn shape: ', X_trn.shape)
print('X_tst shape: ', X_tst.shape)


y_trn = []
for s in train_label:
    y_trn += list(s)
y_trn = np.array(y_trn)
print('y_trn shape: ', y_trn.shape)

y_tst = []
for s in test_label:
    y_tst += list(s)
y_tst = np.array(y_tst)
print('y_tst shape: ', y_tst.shape)
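
Since each row of X is the context window of one word and each entry of y is that word's label, the row counts must match; a quick check:

In [ ]:
# Each row of X is one word's context; each entry of y is that word's label
assert X_trn.shape[0] == y_trn.shape[0]
assert X_tst.shape[0] == y_tst.shape[0]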

In [ ]:
print('Num labels: ',len(set(y_trn)))
print('Num words: ',len(set(idx2w)))

First model

Architecture

- Embedding layer: tf.nn.embedding_lookup
- Dense layer: tf.nn.relu(tf.matmul(x, W) + b)
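
As a reference, here is a minimal raw-TensorFlow sketch of these two operations (all sizes are illustrative placeholders; the model below uses the Keras layer equivalents instead):

In [ ]:
# Raw-TF sketch (TF 1.x): embedding lookup followed by a ReLU dense layer.
vocab_size, emb_size, context_len, n_labels = 600, 64, 10, 127

ids = tf.placeholder(tf.int32, shape=[None, context_len])
emb_matrix = tf.Variable(tf.random_uniform([vocab_size, emb_size], -1.0, 1.0))
emb = tf.nn.embedding_lookup(emb_matrix, ids)          # (batch, context_len, emb_size)
flat = tf.reshape(emb, [-1, context_len * emb_size])   # (batch, context_len * emb_size)

W = tf.Variable(tf.truncated_normal([context_len * emb_size, n_labels], stddev=0.1))
b = tf.Variable(tf.zeros([n_labels]))
hidden = tf.nn.relu(tf.matmul(flat, W) + b)            # the dense layer from the list above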

In [ ]:
# General parameters
LOG_DIR = '/tmp/tensorboard/airline/embeddings/'

# Data attributes
input_seq_length = X_trn.shape[1]
input_vocabulary_size = len(set(idx2w)) + 1  # +1 for the ID_PAD code
output_length = 127  # number of distinct labels

# Model parameters
embedding_size = 64

In [ ]:
# Build the model: a simple model with embeddings (no recurrence yet)

from tensorflow.contrib.keras import layers, models, optimizers

print('Build model 1')
seq_input = layers.Input(shape=(input_seq_length,), name='prev')

#----------------------------------------
# Put your embedding layer here
#----------------------------------------
# One possible completion (a sketch, not the only solution):
embeds = layers.Embedding(input_vocabulary_size, embedding_size)(seq_input)

#----------------------------------------
# You need to do some transformation to connect the embedding output to the dense layer
#----------------------------------------
# The embedding output is (batch, seq_length, embedding_size); flatten it to
# (batch, seq_length * embedding_size) so the dense layer can consume it.
flat = layers.Flatten()(embeds)

#----------------------------------------
# Put your final dense layer here
#----------------------------------------
output = layers.Dense(output_length, activation='softmax')(flat)

model1 = models.Model(inputs=seq_input, outputs=output)
model1.summary()

# Optimizer
adam_optimizer = optimizers.Adam()
model1.compile(loss='sparse_categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

In [ ]:
#Plot the model graph
from tensorflow.contrib.keras import utils

# Create model image
utils.plot_model(model1, '/tmp/model1.png')

# Show image
plt.imshow(plt.imread('/tmp/model1.png'))

In [ ]:
# Fit the model
history = model1.fit(X_trn, y_trn, batch_size=128, epochs=10,
                     validation_data=(X_tst, y_tst))

In [ ]:
# Plot train and validation accuracy in the notebook output
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.legend()
plt.show()
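
The same curves can be plotted for the loss (an optional extra cell):

In [ ]:
# Optional: train and validation loss
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()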

In [ ]:
# Predict: score a new paragraph
def score_paragraph(paragraph):
    # Preprocess: map each word to its index. Out-of-vocabulary words are
    # mapped to ID_PAD as a simple fallback (the original lookup raised a
    # KeyError on unseen words).
    p_w = paragraph.split()
    p_w_c = [w2idx.get(w, ID_PAD) for w in p_w]
    x_score = np.array(context(p_w_c, size=10))

    # Score and decode the most probable label for each word
    pred_score = model1.predict(x_score)
    response = [idx2la[l] for l in np.argmax(pred_score, axis=1)]

    return response


paragraph = 'i need a business ticket in any flight with departure from alaska to las vegas monday with breakfast'
response = score_paragraph(paragraph)
wlength = 35
for wx, la in zip(paragraph.split(), response):
    print(wx.rjust(wlength), la.rjust(wlength))
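
If the model has trained correctly, the city and date words should come back with B-/I- slot labels (for example a fromloc label on alaska and toloc labels on las vegas), while the remaining words are tagged O.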